Still under construction.
(III) Creating detailed list of IMDb Top Rated Movies
- Read and load every line of the page source of each of the 250 movies.
- Add Content Rating, Number of Rater, Genre, Budget, Opening Weekend USA, Gross USA and Cumulative Worldwide Gross by reading each movie’s link.
- The data was collected on 2020-10-29.
| Field | Marker searched for in the page source |
| --- | --- |
| Title | `h1 itemprop="name"` |
| Year | Next line of Title |
| Content Rating | `meta itemprop="contentRating"` |
| User Rating | `span itemprop="ratingValue"` |
| Number of Rater | `itemprop="ratingCount"` |
| Genre | `span class="itemprop" itemprop="genre"` |
| Budget | `<h4 class="inline">Budget` |
| Opening Weekend USA ($) | `<h4 class="inline">Opening Weekend USA` |
| Gross USA ($) | `<h4 class="inline">Gross` |
| Cumulative Worldwide Gross ($) | `<h4 class="inline">Cumulative` |
#Design function to get target information from a single page
#Each input is a website link from `movie_link`
#
#Arguments:
#  input: link (or local file path) of one movie page; handed to readLines().
#Value:
#  Unnamed character vector of length 10, in this order: title, year, content
#  rating, user rating, number of raters, genres (joined with ", "), budget,
#  opening weekend USA, gross USA, cumulative worldwide gross.
#  A field whose marker is not found on the page is returned as "-".
#  When a marker occurs on several lines, the first occurrence is used so the
#  result always has exactly 10 elements.
get.target.info <- function(input) {
  page <- readLines(con = input, encoding = "UTF-8")
  missing_field <- "-"

  #k-th piece of `text` after cutting it at every literal `sep`
  #(NA when there are fewer than k pieces).
  piece <- function(text, sep, k) {
    strsplit(text, split = sep, fixed = TRUE)[[1L]][k]
  }

  #k-th ">"-separated piece of `line`, cut off at its first "<".
  tag_text <- function(line, k) {
    piece(piece(line, ">", k), "<", 1L)
  }

  #Applies `extract` to the first page line containing the literal `marker`
  #(moved down by `offset` lines); yields "-" when the marker is absent.
  field <- function(marker, extract, offset = 0L) {
    hits <- grep(marker, page, fixed = TRUE)
    if (length(hits) == 0L) {
      return(missing_field)
    }
    extract(page[hits[1L] + offset])
  }

  #Second space-separated word after the heading, without a trailing comma.
  #sub() is used instead of an `if` on the last character so that an NA or
  #empty amount passes through instead of raising an error.
  usd_amount <- function(line) {
    sub(",$", "", piece(piece(line, ">", 3L), " ", 2L))
  }

  #Budgets quoted with one of these three-letter codes keep only
  #"<code> <amount>"; anything after the amount is dropped.
  currency_codes <- c("FRF", "JPY", "INR", "DEM", "RUR", "TRL", "AUD", "KRW")
  #Euro / pound budgets are rewritten as "EUR <amount>" / "GBP <amount>".
  #Both the HTML entity and the literal symbol are recognised.
  symbol_prefix <- c("&euro;", "&pound;", "\u20ac", "\u00a3")
  symbol_code <- c("EUR", "GBP", "EUR", "GBP")

  tidy_budget <- function(amount) {
    if (is.na(amount)) {
      return(amount)
    }
    if (substr(amount, 1L, 3L) %in% currency_codes) {
      words <- strsplit(amount, split = " ", fixed = TRUE)[[1L]]
      return(paste(words[1L], words[2L], sep = " "))
    }
    found <- which(startsWith(amount, symbol_prefix))
    if (length(found) > 0L) {
      prefix <- symbol_prefix[found[1L]]
      rest <- substr(amount, nchar(prefix) + 1L, nchar(amount))
      return(paste(symbol_code[found[1L]], rest))
    }
    amount
  }

  #1. title----
  title_marker <- "h1 itemprop=\"name\""
  movie_title <- field(
    title_marker,
    function(line) piece(piece(line, ">", 2L), "&", 1L)
  )
  #2. year---- (sits on the line right after the title line)
  movie_year <- field(
    title_marker,
    function(line) tag_text(line, 2L),
    offset = 1L
  )
  #3. content rating----
  movie_content_rating <- field(
    "meta itemprop=\"contentRating\"",
    function(line) piece(line, ">", 2L)
  )
  #4. user rating----
  movie_user_rating <- field(
    "span itemprop=\"ratingValue\"",
    function(line) tag_text(line, 3L)
  )
  #5. number of rater----
  movie_num_rater <- field(
    "itemprop=\"ratingCount\"",
    function(line) tag_text(line, 3L)
  )
  #6. genre---- (every matching line contributes one genre)
  genre_lines <- grep(
    "span class=\"itemprop\" itemprop=\"genre\"",
    page,
    fixed = TRUE,
    value = TRUE
  )
  if (length(genre_lines) > 0L) {
    genres <- vapply(
      genre_lines,
      function(line) tag_text(line, 3L),
      character(1),
      USE.NAMES = FALSE
    )
    movie_genre <- paste(genres, collapse = ", ")
  } else {
    movie_genre <- missing_field
  }
  #7. budget----
  movie_budget <- field(
    "<h4 class=\"inline\">Budget",
    function(line) tidy_budget(piece(line, ">", 3L))
  )
  #8. opening----
  movie_opening <- field("<h4 class=\"inline\">Opening Weekend USA", usd_amount)
  #9. gross----
  movie_gross <- field("<h4 class=\"inline\">Gross", usd_amount)
  #10. worldwide gross----
  movie_worldwide_gross <- field("<h4 class=\"inline\">Cumulative", usd_amount)

  #11. result----
  c(
    movie_title, movie_year, movie_content_rating, movie_user_rating,
    movie_num_rater, movie_genre, movie_budget, movie_opening, movie_gross,
    movie_worldwide_gross
  )
}
#Collecting data----
#One slot per movie is reserved up front so the loop fills each vector in
#place; appending with c() would copy every vector again on each iteration.
n_movies <- 250L
movie_title <- character(n_movies)
movie_year <- character(n_movies)
movie_content_rating <- character(n_movies)
movie_user_rating <- character(n_movies)
movie_num_rater <- character(n_movies)
movie_genre <- character(n_movies)
movie_budget <- character(n_movies)
movie_opening <- character(n_movies)
movie_gross <- character(n_movies)
movie_worldwide_gross <- character(n_movies)
for (i in seq_len(n_movies)) {
  #Fields are read by position from the vector returned for page i.
  temp.target.info <- get.target.info(movie_link[i])
  movie_title[i] <- temp.target.info[1]
  movie_year[i] <- temp.target.info[2]
  movie_content_rating[i] <- temp.target.info[3]
  movie_user_rating[i] <- temp.target.info[4]
  movie_num_rater[i] <- temp.target.info[5]
  movie_genre[i] <- temp.target.info[6]
  movie_budget[i] <- temp.target.info[7]
  movie_opening[i] <- temp.target.info[8]
  movie_gross[i] <- temp.target.info[9]
  movie_worldwide_gross[i] <- temp.target.info[10]
}
#Visualization----
library(knitr)
#Assemble the eleven collected vectors into one table, one row per movie.
y <- data.frame(
  movie_rank,
  movie_title,
  movie_year,
  movie_content_rating,
  movie_user_rating,
  movie_num_rater,
  movie_genre,
  movie_budget,
  movie_opening,
  movie_gross,
  movie_worldwide_gross
)
#Store every column as plain text; `y[] <-` keeps the data frame's row and
#column names while swapping the column contents.
y[] <- lapply(y, as.character)
#Centre-aligned table with human-readable column headings.
table_headings <- c(
  "Rank", "Title", "Year", "Content Rating", "User Rating",
  "Number of Rater", "Genre", "Budget", "Opening Weekend USA", "Gross USA",
  "Cumulative Worldwide Gross"
)
kable(y, align = "c", col.names = table_headings)